This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
This project makes use of many packages, especially: DESeq2 from Bioconductor http://bioconductor.org/packages/release/bioc/html/DESeq2.html.
The goal of this document it to provide a reproducible comprehensive overview of data analysis methods
library("checkpoint") #Part of MS R Open to make this software stack more reproducible
library("knitr") #package for report generation; assists with R markdown formatting
checkpoint("2017-07-01", use.knitr = T)
library("dplyr") #package for manipulating data frames
library("ggplot2") #plotting package
library("reshape2") #package for manipulating data frames
library("ggrepel") #add on to ggplot2 for generating labels
library("cowplot") #add on to ggplot2 for building "publication ready plots"
library("viridis") #color palette package
library("broom") #tidy stats outputs
library("kableExtra")
library("DESeq2") #significance testing for RNA-seq data
knitr::opts_chunk$set(cache=TRUE)
theme_set(theme_bw())
set.seed('711')
# DESeq is installed through bioconductor
source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2", suppressUpdates = T)
options(stringsAsFactors = FALSE)
#info of treatments and replicates
info.48 <- read.csv("../data/Condition_Info_HL_48.csv")
info.58 <- read.csv("../data/Condition_Info_HL_58.csv")
#growth curves and GFP-enabled FACS data
grow <- read.csv("../data/Growth.2.csv")
#gene annotation files
an.48 <- read.csv("../data/HL_48_Neat.csv")
an.58 <- read.csv("../data/HL_58_Neat.csv")
#raw gene rollup counts
raw.48 <- read.csv("../data/HL48_Raw_Counts.csv")
raw.58 <- read.csv("../data/HL58_Raw_Counts.csv")
meta <- read.csv("../data/Metabolites.2.csv")
Bulk growth curves measured for each axenic and co-culture treatment the relative abundance of each species in co-culture is reported as the fraction of cells maintaining GFP these fractions were measured via FACS
#GFP.num <- data.frame(grow$OD600*grow$HL58.GFP.frac)
#colnames(GFP.num) <- "HL-58"
#parent.num <- data.frame(grow$OD600*grow$HL48.frac)
#colnames(parent.num) <- "HL-48"
#grow <- cbind(grow, GFP.num, parent.num)
#vars <- c("HL.58.GFP.frac", "parent.num") #try to pull frac values out of data frame
#grow <- cbind(grow[!vars], GFP.num, parent.num) #try to add abs values
t1 <- melt(grow, id.vars = c("Treatment", "Cult", "Species", "Substrate",
"Sample.ID", "Replicate", "Time.h", "GFP.frac", "parent.frac"))
#long format data
p.gro <- ggplot(t1, aes(x = Time.h, y = value, color = variable)) +
geom_point() +
#scale_y_log10() +
facet_grid(Cult ~ Substrate, scales = "free_x") +
theme(strip.background = element_blank(), legend.position = "none") +
geom_smooth(se = F)
p.gro
## `geom_smooth()` using method = 'loess'
t1 %>% dim
t2 <- subset(t1, Time.h > 0)
t2.g <- subset(t2, Substrate=="Glucose" & Time.h < 18)
t2.x <- subset(t2, Substrate=="Xylose" & Time.h < 45)
t2 <- rbind(t2.g, t2.x)
t2 %>% dim
# Let's see if we can add slopes and p-values to these graphs
# See https://stackoverflow.com/questions/17022553/adding-r2-on-graph-with-facets
# Also used on https://github.com/pnnl/bernstein-2017-productivity-and-diversity-2/
df <- subset(t2, Substrate=="Xylose" & Cult=="Axenic" & variable == "HL.48")
df <- subset(t2, Substrate=="Xylose" & Cult=="Axenic" & variable == "Total.OD600")
head(df)
dim(df)
lm_eqn_growth = function(df){
if(all(is.na(df$value))) return("")
# Super important! Return an empty string for the missing values.
m = summary(lm(log(value) ~ Time.h, df)) # Hardcoded to my data
m
m$coefficients
m$coefficients[2] # Slope of Time.h
m$coefficients[8] # P value of Time.h
if(m$coefficients[8] < 0.001) {
outputp <- "0.001"
}else{
outputp <- round(m$coefficients[8], digits = 3)
}
eq <- substitute(
atop("slope ="~slope, "p-value ="~pr), # Two lines
#"slope ="~slope~","~~R^2~"="~r2, # One line
list(slope = signif(m$coefficients[2], digits = 3),
pr = outputp)
)
return(as.character(as.expression(eq)))
}
lm_eqn_growth(df)
eqns <- by(t2, INDICES = list(t2$Substrate, t2$Cult, t2$variable), lm_eqn_growth)
eqns
df2 <-
data.frame(eq = c(eqns),
Substrate = rep(c("Glucose", "Xylose"), 6),
Cult = rep(c("Axenic", "Axenic", "Co-culture","Co-culture"), 3),
variable = c(rep("Total.OD600", 4), rep("HL.58", 4), rep("HL.48", 4))
,graphx = c(10, 10, 8, 20, 14, 35, 14, 22, 8, 20, 8, 20)
,graphy = c(.1, .1, .7, .7, .02, .05, .02, .014, .6, .6, .3, .3)
)
df2
df2 <- subset(df2, eq != "") # Remove the empty lines
p.gro <- ggplot(t2, aes(x = Time.h, y = value, color = variable)) +
geom_point() +
scale_y_log10() +
geom_text(data = df2, aes(x = graphx, y = graphy, label = eq), parse = TRUE, show.legend = F) +
facet_grid(Cult ~ Substrate, scales = "free_x") +
labs(x = "Time (h)", y = expression(log(OD[600])), parse = T) +
scale_color_manual(values = c("#999999", "#6DCD59FF", "#482878")) +
theme(strip.background = element_blank(), legend.position = "none")
p.gro + geom_smooth(method = "lm", fill = "#CCCCCC", show.legend = F)
ggsave("figures/fig1-parts/fig1-growth.pdf", width = 120, units = "mm", height = 90, scale = 1.5)
#### Try it flipped!
# p.gro <- ggplot(t2, aes(x = Time.h, y = value, color = variable)) +
# geom_point() +
# scale_y_log10() +
# geom_text(data = df2, aes(x = graphx, y = graphy, label = eq), parse = TRUE, show.legend = F) +
# facet_grid(Substrate ~ Cult, scales = "free_x") +
# labs(x = "Time (h)", y = expression(log(OD[600])), parse = T) +
# scale_color_manual(values = c("#999999", "#6DCD59FF", "#482878")) +
# theme(strip.background = element_blank(), legend.position = "none")
# p.gro + geom_smooth(method = "lm", fill = "#CCCCCC", show.legend = F)
#
# # Note how "free_x" no longer applied because Time.h must be consistent
# while stacked.
# # Positions of labels also make less sense.
# ggsave("figures/fig1-parts/fig1-growth-flipped.pdf", width = 120, units = "mm", height = 90, scale = 1.5)
Cutaway of one frame from the above figure.
Confirm this goal: Show full time course of one block of the ANOVA design, without dropping timepoints or using a log10() transform.
# Start with the full t1 subset
t3 <- subset(t1, Substrate=="Glucose" & Cult == "Co-culture")
p.gro <- ggplot(t3, aes(x = Time.h, y = value, color = variable)) +
geom_point() +
facet_grid(Cult ~ Substrate, scales = "free_x") +
labs(x = "Time (h)", y = expression(OD[600]), parse = T) +
scale_color_manual(values = c("#999999", "#6DCD59FF", "#482878")) +
theme(strip.background = element_blank(), legend.position = "none")
p.gro + geom_smooth(fill = "#CCCCCC", show.legend = F)
## `geom_smooth()` using method = 'loess'
ggsave("figures/fig1-parts/fig1-growth-cutaway.pdf", width = 60, units = "mm", height = 50, scale = 1.5)
## `geom_smooth()` using method = 'loess'
# All data, for use as a sub figure.
# We add back in the legend, so this does not need the legend in another part of a larger figure.
p.gro <- ggplot(t1, aes(x = Time.h, y = value, color = variable)) +
geom_point() +
facet_grid(Cult ~ Substrate, scales = "free_x") +
labs(x = "Time (h)", y = expression(OD[600]), parse = T) +
scale_color_manual(values = c("#999999", "#6DCD59FF", "#482878"), name = "Treatment") +
theme(strip.background = element_blank(), legend.position = c(.1,.88))
p.gro + geom_smooth(fill = "#CCCCCC", show.legend = F)
## `geom_smooth()` using method = 'loess'
ggsave("figures/fig1-parts/fig1-growth-cutaway-full.pdf", width = 120, units = "mm", height = 90, scale = 1.5)
## `geom_smooth()` using method = 'loess'
NMR metabolomics data derived from culture filtrate
Analyze and compare the concentrations of external metabolites across treatments.
summary(meta)
str(meta)
met <- filter(meta, Metabolite != "DSS-d6 (Chemical Shape Indicator)")
met <- filter(met, Experiment.Date != "Control March 2017")
met <- filter(met, Metabolite != "Glucose")
met <- filter(met, Metabolite != "Xylose")
met$Microbe <- factor(met$Microbe, levels = c("HL-48", "HL-48/58", "HL-58"))
g.met <- ggplot(met, aes(Metabolite, Conc.uM, color = Microbe))
g.met +
geom_boxplot(width = .5) +
labs(y = expression(paste("Concentration (", mu, M, ")")), color = "Treatment: ", x = "") +
facet_wrap(~Substrate, ncol = 2) +
scale_color_manual(values = c("#482878", "#777777", "#6DCD59FF")) +
theme(strip.background = element_blank(), axis.text.x = element_text(angle = -20, hjust=0)
,legend.position = "top", legend.direction = "horizontal"
,plot.margin = margin(0, 1, 0, .5, "lines")
#,legend.position = "none", plot.margin = unit(c(0, 0, 0, .5), "lines")
)
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
# wide; full width of growth curve
ggsave("figures/fig1-parts/fig1-mets-wide1.pdf", width = 120, units = "mm", height = 60, scale = 1.3)
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
# square; half width of growth curve
ggsave("figures/fig1-parts/fig1-mets.pdf", width = 60, units = "mm", height = 50, scale = 1.4)
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
# Try it flipped (matches flipped growth graph)
# g.met +
# geom_boxplot(width = .5) +
# labs(y = expression(paste("Concentration (", mu, M, ")")), color = "Treatment", x = "") +
# facet_wrap(~Substrate, ncol = 1, strip.position = "right") +
# scale_color_manual(values = c("#482878", "#777777", "#6DCD59FF")) +
# theme(strip.background = element_blank(), axis.text.x = element_text(angle = -20, hjust=0))
#
# ggsave("figures/fig1-parts/fig1-mets-flipped.pdf", width = 60, units = "mm", height = 50, scale = 1.4)
We need to do t-test to establish differences between ethanol and acetate abundances between treatments. This can be output as a supplementary table or kable in the markdown. You will see some notes in the Results text corresponding to this as well.
met %>% head
## Treatment Sample.Number Experiment.Date Replicate Metabolite Conc.uM X
## 1 HL-48 G N1 Main Jan 2017 1 Acetate 11.4 G
## 2 HL-48 G N1 Main Jan 2017 1 Ethanol 18.8 G
## 3 HL-48 G N1 Main Jan 2017 1 Formate 11.2 G
## 4 HL-48 G N2 Main Jan 2017 2 Acetate 10.0 G
## 5 HL-48 G N2 Main Jan 2017 2 Ethanol 15.0 G
## 6 HL-48 G N2 Main Jan 2017 2 Formate 12.4 G
## Substrate Cult Microbe
## 1 Glucose Axenic HL-48
## 2 Glucose Axenic HL-48
## 3 Glucose Axenic HL-48
## 4 Glucose Axenic HL-48
## 5 Glucose Axenic HL-48
## 6 Glucose Axenic HL-48
# Thanks to the magic of dplyr, broom, and default stats, we can do this:
met %>% group_by(Substrate, Metabolite) %>%
do(tidy(pairwise.t.test(.$Conc.uM, .$Microbe, p.adj = "holm"))) %>%
kable() %>% kable_styling(full_width = F)
## Currently generic markdown table using pandoc is not supported.
| Substrate | Metabolite | group1 | group2 | p.value |
|---|---|---|---|---|
| Glucose | Acetate | HL-48/58 | HL-48 | 0.0200808 |
| Glucose | Acetate | HL-58 | HL-48 | 0.0006665 |
| Glucose | Acetate | HL-58 | HL-48/58 | 0.0529371 |
| Glucose | Ethanol | HL-48/58 | HL-48 | 0.0000932 |
| Glucose | Ethanol | HL-58 | HL-48 | 0.0010511 |
| Glucose | Ethanol | HL-58 | HL-48/58 | 0.1014061 |
| Glucose | Formate | HL-48/58 | HL-48 | 0.7321162 |
| Glucose | Formate | HL-58 | HL-48 | 0.7321162 |
| Glucose | Formate | HL-58 | HL-48/58 | 0.3372185 |
| Xylose | Acetate | HL-48/58 | HL-48 | 0.2453340 |
| Xylose | Ethanol | HL-48/58 | HL-48 | 0.1623205 |
| Xylose | Formate | HL-48/58 | HL-48 | 0.2733803 |
# Note we are using the 'holm' correction because it's 'uniformly more powerful' then 'bonf'.
# If we want to switch to Bonferroni because it's popular or we want CIs, that's ok too.
# let's pull mean values too
#met %>% group_by(Substrate, Metabolite, Microbe) %>% summarise(mean.Conc.uM = mean(Conc.uM, na.rm = T))
#met %>% group_by(Substrate, Metabolite, Microbe) %>% summarise(med.Conc.uM = median(Conc.uM, na.rm = T))
met %>% group_by(Substrate, Metabolite, Microbe) %>% summarise(med.Conc.uM = median(Conc.uM, na.rm = T)) %>%
filter(Substrate == "Xylose", Metabolite == "Ethanol")
## # A tibble: 2 x 4
## # Groups: Substrate, Metabolite [1]
## Substrate Metabolite Microbe med.Conc.uM
## <chr> <chr> <fctr> <dbl>
## 1 Xylose Ethanol HL-48 5.70
## 2 Xylose Ethanol HL-48/58 4.55
5.7-4.55
## [1] 1.15
(5.7-4.55)/5.7
## [1] 0.2017544
met %>% group_by(Substrate, Metabolite, Microbe) %>% summarise(mean.Conc.uM = mean(Conc.uM, na.rm = T)) %>%
filter(Substrate == "Xylose", Metabolite == "Ethanol")
## # A tibble: 2 x 4
## # Groups: Substrate, Metabolite [1]
## Substrate Metabolite Microbe mean.Conc.uM
## <chr> <chr> <fctr> <dbl>
## 1 Xylose Ethanol HL-48 5.380
## 2 Xylose Ethanol HL-48/58 4.625
5.380-4.625
## [1] 0.755
(5.380-4.625)/5.380
## [1] 0.1403346
# Line 156
met %>% group_by(Substrate, Metabolite, Microbe) %>% summarise(mean.Conc.uM = mean(Conc.uM, na.rm = T)) %>%
filter(Substrate == "Glucose", Metabolite == "Ethanol")
## # A tibble: 3 x 4
## # Groups: Substrate, Metabolite [1]
## Substrate Metabolite Microbe mean.Conc.uM
## <chr> <chr> <fctr> <dbl>
## 1 Glucose Ethanol HL-48 15.16
## 2 Glucose Ethanol HL-48/58 7.22
## 3 Glucose Ethanol HL-58 9.40
15.16/7.22 # subtract 1 to get 'percent increase'
## [1] 2.099723
# line 174
met %>% group_by(Substrate, Metabolite, Microbe) %>% summarise(mean.Conc.uM = mean(Conc.uM, na.rm = T)) %>%
filter(Substrate == "Glucose", Metabolite == "Ethanol")
## # A tibble: 3 x 4
## # Groups: Substrate, Metabolite [1]
## Substrate Metabolite Microbe mean.Conc.uM
## <chr> <chr> <fctr> <dbl>
## 1 Glucose Ethanol HL-48 15.16
## 2 Glucose Ethanol HL-48/58 7.22
## 3 Glucose Ethanol HL-58 9.40
(7.22/9.40) # relative
## [1] 0.7680851
(7.22- 9.40) / 9.4 # subtract 1 to get 'percent increase'
## [1] -0.2319149
Volcano plots!
RNA seq data processing for Halomonas HL-48; generate normalized counts in RPKM
dfcountData <- data.frame(raw.48, row.names = 1)
dfcolData <- data.frame(info.48, row.names = 1)
dds <- DESeqDataSetFromMatrix(countData = dfcountData, colData = dfcolData, design = ~condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds.48 <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
notAllZero <- (rowSums(counts(dds.48)) > 0)
vsd <- varianceStabilizingTransformation(dds.48)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
HL48.norm.count <- data.frame(assay(vsd[notAllZero,]))
HL48.norm.count <- add_rownames(HL48.norm.count, "GeneID")
## Warning: Deprecated, use tibble::rownames_to_column() instead.
HL48.norm.count <- merge(an.48, HL48.norm.count, by = "GeneID")
write.table(HL48.norm.count, file="RNA_seq_outputs/HL48_Norm_Exp_Values_annotated.csv", quote=FALSE, sep=",", row.names=FALSE, col.names=TRUE)
kable(head(HL48.norm.count))
| GeneID | Acc_num | Main_Role | Subrole | Product | Gene | T1_HL48G | T2_HL48G | T4_HL48G | T5_HL48G | T11_HL48G | T12_HL48G | T13_HL48G | T14_HL48G | T15_HL48G | T16_HL48X | T17_HL48X | T18_HL48X | T19_HL48X | T21_HL48X | T22_HL48X | T23_HL48X | T24_HL48X |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| hotlake_ucc_124538 | CY41DRAFT_0125 | Nucleic acid metabolism | DNA replication_ recombination_ and repair | DNA repair protein radc | -5.7848388 | -6.3143024 | -4.4814389 | -4.5377806 | -4.5265101 | -4.8708085 | -5.510563 | -4.6542463 | -5.3449012 | -6.0842219 | -3.9634773 | -6.5836009 | -5.8550017 | -5.5382288 | -5.2826004 | -5.4470966 | -5.0667479 | |
| hotlake_ucc_124539 | CY41DRAFT_0132 | Nucleic acid metabolism | DNA replication_ recombination_ and repair | Nucleotidyltransferase/DNA polymerase involved in DNA repair | 4.9963436 | 4.7277389 | 4.7600992 | 6.3692662 | 7.2207782 | 5.0379955 | 6.861618 | 6.0918492 | 5.2610346 | 9.1327328 | 9.3193559 | 9.1138284 | 9.3900502 | 9.1654333 | 9.0195327 | 9.1943398 | 4.6304036 | |
| hotlake_ucc_124540 | CY41DRAFT_0143 | Mobile and extrachromosomal element functions | Selfish genetic elements | IS3 family transposase | 1.8254382 | 1.9322351 | 0.8025498 | 3.1816689 | 3.8016526 | 3.3107449 | 3.334614 | 3.0542134 | 1.7957520 | 0.5970179 | -0.2134157 | 0.0705615 | 0.0969528 | 0.9398863 | 0.6832929 | 0.9688571 | 2.4263460 | |
| hotlake_ucc_124541 | CY41DRAFT_0145 | transcriptional regulator-like protein | 0.0961810 | 0.0142558 | -1.6310959 | 0.9014977 | -0.7069261 | 0.3712687 | 1.719177 | 0.2093218 | -0.2096073 | 1.5702723 | 0.9160743 | 1.9851819 | 0.7020047 | 2.3341311 | 1.7835076 | 1.1339884 | 0.6228251 | |||
| hotlake_ucc_124542 | CY41DRAFT_0180 | hypothetical protein | 0.0081833 | -0.1290377 | -0.5916545 | -1.1555539 | -1.8462097 | 0.5030828 | -1.057093 | -2.0216193 | 1.2537645 | -2.0979835 | -1.3978755 | -1.2436046 | -3.7383752 | -1.7227944 | -1.8882386 | -1.7831942 | 1.7534734 | |||
| hotlake_ucc_124543 | CY41DRAFT_0217 | transposase | -3.2836200 | -2.3520823 | -0.8251749 | -2.6913921 | -1.0130258 | -2.0514015 | -2.805127 | 0.4255340 | -1.2107125 | -3.6241258 | -4.7173176 | -2.2823046 | -2.8908062 | -1.9026651 | -4.4514642 | -3.1411675 | -2.9460297 |
# HL48.norm.count %>% head
# # So this simple df includes gene annotations and is normalized...
# dds.48 %>% assay() %>% head
# # ...while this is the full object, without normalization.
colData(dds.48)$condition %>% table
## .
## HL_48_58_G HL_48_58_X HL_48_G HL_48_X
## 5 4 4 4
General function for DESeq contrasts.
deseq_diff <- function(df, contrasts, annotations){
# perform deseq contrast
r <- results(df, contrasts)
# Convert to table and relablel row names to GeneID
r <- add_rownames(data.frame(r), "GeneID")
# merge in annotations from file
r.an <- merge(annotations, r, by = "GeneID")
return(r.an)
}
generate differential expression output for HL-48 glucose competition treatment HL-48 glucose axenic vs. HL-58/HL-48 glucose co-culture
HL48.diff.comp <- deseq_diff(dds.48, c("condition","HL_48_58_G","HL_48_G"), annotations = an.48)
write.table(HL48.diff.comp, file="RNA_seq_outputs/HL48.diff.comp.csv", quote=FALSE, sep=",", row.names=FALSE, col.names=TRUE)
kable(head(HL48.diff.comp))
| GeneID | Acc_num | Main_Role | Subrole | Product | Gene | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|---|---|---|---|---|
| hotlake_ucc_124538 | CY41DRAFT_0125 | Nucleic acid metabolism | DNA replication_ recombination_ and repair | DNA repair protein radc | 9.761608 | 0.2435938 | 0.4007812 | 0.6077975 | 0.5433218 | 0.8554307 | |
| hotlake_ucc_124539 | CY41DRAFT_0132 | Nucleic acid metabolism | DNA replication_ recombination_ and repair | Nucleotidyltransferase/DNA polymerase involved in DNA repair | 800.375384 | 0.3960024 | 0.3193219 | 1.2401357 | 0.2149252 | 0.6491502 | |
| hotlake_ucc_124540 | CY41DRAFT_0143 | Mobile and extrachromosomal element functions | Selfish genetic elements | IS3 family transposase | 138.723430 | 0.4380131 | 0.2171244 | 2.0173366 | 0.0436604 | 0.3363009 | |
| hotlake_ucc_124541 | CY41DRAFT_0145 | transcriptional regulator-like protein | 99.795097 | 0.1635539 | 0.2409007 | 0.6789266 | 0.4971844 | 0.8371530 | |||
| hotlake_ucc_124542 | CY41DRAFT_0180 | hypothetical protein | 60.185987 | 0.0298161 | 0.3886594 | 0.0767153 | 0.9388500 | 0.9854406 | |||
| hotlake_ucc_124543 | CY41DRAFT_0217 | transposase | 36.230425 | 0.5619926 | 0.3528957 | 1.5925176 | 0.1112685 | 0.5081949 |
generate differential expression output for HL-48 xylose commensalism treatment HL-48 xylose axenic vs. HL-58/HL-48 xylose co-culture
HL48.diff.cmns <- deseq_diff(dds.48, c("condition","HL_48_58_X","HL_48_X"), an.48)
write.table(HL48.diff.cmns, file="RNA_seq_outputs/HL48.diff.cmns.csv", quote=FALSE, sep=",", row.names=FALSE, col.names=TRUE)
kable(head(HL48.diff.cmns))
| GeneID | Acc_num | Main_Role | Subrole | Product | Gene | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|---|---|---|---|---|
| hotlake_ucc_124538 | CY41DRAFT_0125 | Nucleic acid metabolism | DNA replication_ recombination_ and repair | DNA repair protein radc | 9.761608 | 0.0039493 | 0.4815792 | 0.0082007 | 0.9934569 | 0.9963762 | |
| hotlake_ucc_124539 | CY41DRAFT_0132 | Nucleic acid metabolism | DNA replication_ recombination_ and repair | Nucleotidyltransferase/DNA polymerase involved in DNA repair | 800.375384 | -0.3664611 | 0.3343952 | -1.0958924 | 0.2731259 | 0.5124676 | |
| hotlake_ucc_124540 | CY41DRAFT_0143 | Mobile and extrachromosomal element functions | Selfish genetic elements | IS3 family transposase | 138.723430 | 0.5323011 | 0.2499474 | 2.1296527 | 0.0332003 | 0.1327357 | |
| hotlake_ucc_124541 | CY41DRAFT_0145 | transcriptional regulator-like protein | 99.795097 | 0.1050630 | 0.2529892 | 0.4152866 | 0.6779321 | 0.8371175 | |||
| hotlake_ucc_124542 | CY41DRAFT_0180 | hypothetical protein | 60.185987 | 0.7602655 | 0.4311079 | 1.7635158 | 0.0778135 | 0.2392591 | |||
| hotlake_ucc_124543 | CY41DRAFT_0217 | transposase | 36.230425 | 0.2275172 | 0.4146091 | 0.5487511 | 0.5831763 | 0.7759768 |
generate differential expression output for HL-48 competition over commensalism HL-58/HL-48 xylose co-culture (cmns) vs. HL-58/HL-48 glucose co-culture (comp)
# New plot comparing treatments
HL48.diff.coculture <- deseq_diff(dds.48, c("condition","HL_48_58_G", "HL_48_58_X"), an.48)
write.table(HL48.diff.coculture, file="RNA_seq_outputs/HL48.diff.coculture.csv", quote=FALSE, sep=",", row.names=FALSE, col.names=TRUE)
kable(head(HL48.diff.coculture))
| GeneID | Acc_num | Main_Role | Subrole | Product | Gene | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|---|---|---|---|---|
| hotlake_ucc_124538 | CY41DRAFT_0125 | Nucleic acid metabolism | DNA replication_ recombination_ and repair | DNA repair protein radc | 9.761608 | 0.2443369 | 0.3846645 | 0.6351949 | 0.5253013 | 0.6083656 | |
| hotlake_ucc_124539 | CY41DRAFT_0132 | Nucleic acid metabolism | DNA replication_ recombination_ and repair | Nucleotidyltransferase/DNA polymerase involved in DNA repair | 800.375384 | -1.0727827 | 0.3176367 | -3.3773892 | 0.0007318 | 0.0019697 | |
| hotlake_ucc_124540 | CY41DRAFT_0143 | Mobile and extrachromosomal element functions | Selfish genetic elements | IS3 family transposase | 138.723430 | 0.7620211 | 0.2151973 | 3.5410352 | 0.0003986 | 0.0011479 | |
| hotlake_ucc_124541 | CY41DRAFT_0145 | transcriptional regulator-like protein | 99.795097 | -0.4921529 | 0.2333701 | -2.1088944 | 0.0349537 | 0.0614616 | |||
| hotlake_ucc_124542 | CY41DRAFT_0180 | hypothetical protein | 60.185987 | 0.0664750 | 0.3848789 | 0.1727166 | 0.8628742 | 0.8995962 | |||
| hotlake_ucc_124543 | CY41DRAFT_0217 | transposase | 36.230425 | 0.9260233 | 0.3489880 | 2.6534531 | 0.0079673 | 0.0170592 |
RNA seq data processing for Marinobacter HL-58; generate normalized counts in RPKM
dfcountData <- data.frame(raw.58, row.names = 1)
dfcolData <- data.frame(info.58, row.names = 1)
dds.58 <- DESeqDataSetFromMatrix(countData = dfcountData, colData = dfcolData, design = ~condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds.58 <- DESeq(dds.58)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
notAllZero <- (rowSums(counts(dds.58)) > 0)
vsd <- varianceStabilizingTransformation(dds.58)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
HL58.norm.count <- data.frame(assay(vsd[notAllZero,]))
HL58.norm.count <- add_rownames(HL58.norm.count, "GeneID")
## Warning: Deprecated, use tibble::rownames_to_column() instead.
HL58.norm.count <- merge(an.58, HL58.norm.count, by = "GeneID")
write.table(HL58.norm.count, file="RNA_seq_outputs/HL58_Norm_Exp_Values_annotated.csv", quote=FALSE, sep=",", row.names=FALSE, col.names=TRUE)
kable(head(HL58.norm.count))
| GeneID | Acc_num | Main_Role | Subrole | Product | Gene | T6_HL58G | T7_HL58G | T8_HL58G | T9_HL58G | T10_HL58G | T11_HL58G | T12_HL58G | T13_HL58G | T14_HL58G | T15_HL58G | T21_HL58X | T22_HL58X | T23_HL58X | T24_HL58X |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| hotlake_ucc_124834 | CD01DRAFT_3221 | Glycan biosynthesis and metabolism | Peptidoglycan metabolism | UDP-N-acetylmuramate dehydrogenase | 11.3358256 | 11.1800462 | 11.5913774 | 12.6287175 | 12.2522154 | 10.6245245 | 11.458865 | 10.8227724 | 11.3105664 | 11.868815 | 10.837703 | 10.8722827 | 10.9260847 | 9.8385927 | |
| hotlake_ucc_124835 | CD01DRAFT_3256 | Xenobiotics biodegradation and metabolism | Benzoate degradation | acetyl-CoA acetyltransferase | 0.2893289 | 0.0618485 | 0.2695364 | -0.2584539 | 0.0814818 | 1.6733887 | 3.072722 | 1.2447668 | 1.4060076 | 3.627405 | 2.325894 | 2.8275946 | 2.6694170 | 0.6611770 | |
| hotlake_ucc_80896 | CD01DRAFT_0003 | Regulatory functions::Signal transduction::Unknown function | Enzymes of unknown specificity::Two-component systems::Taxis::Small molecule interactions | two component signal transduction system histidine kinase | -3.9207873 | -3.8945012 | -3.5862830 | -4.0033188 | -4.8527091 | -4.2900475 | -5.839016 | -5.7183508 | -5.5616983 | -6.397703 | -6.397703 | -6.3977031 | -6.3977031 | -4.6529033 | |
| hotlake_ucc_80898 | CD01DRAFT_0005 | outer membrane porin | -0.3121074 | -1.1220738 | -0.0941957 | 1.3831640 | -0.9062989 | 0.7489158 | -1.807176 | -0.8968551 | 0.1051990 | -1.515550 | 7.121093 | 7.8805851 | 7.4294346 | 9.2445007 | |||
| hotlake_ucc_80900 | CD01DRAFT_0006 | Unknown function | Enzymes of unknown specificity | Diacylglycerol O-acyltransferase | 0.1854444 | 0.3224793 | 0.0633558 | 0.0108127 | -0.4184039 | 0.5333536 | 1.387705 | -0.2424177 | 0.3700958 | 1.415378 | 2.325894 | -0.3955064 | -0.7815176 | 0.3493803 | |
| hotlake_ucc_80901 | CD01DRAFT_0007 | hypothetical protein | 7.5393210 | 7.6051410 | 7.1777729 | 8.2342630 | 7.5543323 | 10.3865789 | 8.438497 | 10.8655177 | 10.2467437 | 8.178904 | 10.794648 | 11.1480828 | 11.0841307 | 10.6393807 |
# HL58.norm.count %>% head
# # So this simple df includes gene annotations and is normalized...
# dds.58 %>% assay() %>% head
# # ...while this is the full object, without normalization. Just like last time
colData(dds.58)$condition %>% table
## .
## HL_48_58_G HL_48_58_X HL_58_G
## 5 4 5
more RNA seq data processing for Marinobacter HL-58; generate differential expression output for HL-58 glucose competition treatment HL-58 glucose axenic vs. HL-58/HL-48 glucose co-culture note that HL-58 does not grow on xylose; hence, differential expression cannot be analyzed for HL-58 xylose commensalism
HL58.diff.comp <- deseq_diff(dds.58, c("condition","HL_48_58_G","HL_58_G"), an.58)
write.table(HL58.diff.comp, file="RNA_seq_outputs/HL58.diff.comp.csv", quote=FALSE, sep=",", row.names=TRUE, col.names=TRUE)
kable(head(HL58.diff.comp))
| GeneID | Acc_num | Main_Role | Subrole | Product | Gene | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|---|---|---|---|---|
| hotlake_ucc_124834 | CD01DRAFT_3221 | Glycan biosynthesis and metabolism | Peptidoglycan metabolism | UDP-N-acetylmuramate dehydrogenase | 2542.987859 | -0.3681853 | 0.2064011 | -1.7838339 | 0.0744507 | 0.1183062 | |
| hotlake_ucc_124835 | CD01DRAFT_3256 | Xenobiotics biodegradation and metabolism | Benzoate degradation | acetyl-CoA acetyltransferase | 100.671202 | 1.1086581 | 0.2105873 | 5.2646020 | 0.0000001 | 0.0000008 | |
| hotlake_ucc_80896 | CD01DRAFT_0003 | Regulatory functions::Signal transduction::Unknown function | Enzymes of unknown specificity::Two-component systems::Taxis::Small molecule interactions | two component signal transduction system histidine kinase | 3.303432 | -2.8295761 | 0.6983407 | -4.0518561 | 0.0000508 | 0.0001784 | |
| hotlake_ucc_80897 | CD01DRAFT_0004 | COG3287 family protein of unknown function | 0.000000 | NA | NA | NA | NA | NA | |||
| hotlake_ucc_80898 | CD01DRAFT_0005 | outer membrane porin | 247.862849 | -0.2903843 | 0.3103231 | -0.9357484 | 0.3494028 | 0.4339249 | |||
| hotlake_ucc_80900 | CD01DRAFT_0006 | Unknown function | Enzymes of unknown specificity | Diacylglycerol O-acyltransferase | 67.393844 | 0.3850037 | 0.1726139 | 2.2304326 | 0.0257187 | 0.0469052 |
generate differential expression output for HL-58 competition over commensalism HL-58/HL-48 xylose co-culture (cmns) vs. HL-58/HL-48 glucose co-culture (comp)
# new graph comparing two treatments
HL58.diff.coculture <- deseq_diff(dds.58, c("condition","HL_48_58_G","HL_48_58_X"), an.58)
write.table(HL58.diff.coculture, file="RNA_seq_outputs/HL58.diff.coculture.csv", quote=FALSE, sep=",", row.names=TRUE, col.names=TRUE)
kable(head(HL58.diff.coculture))
| GeneID | Acc_num | Main_Role | Subrole | Product | Gene | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|---|---|---|---|---|
| hotlake_ucc_124834 | CD01DRAFT_3221 | Glycan biosynthesis and metabolism | Peptidoglycan metabolism | UDP-N-acetylmuramate dehydrogenase | 2542.987859 | 0.3579844 | 0.2244110 | 1.5952182 | 0.1106635 | 0.1751276 | |
| hotlake_ucc_124835 | CD01DRAFT_3256 | Xenobiotics biodegradation and metabolism | Benzoate degradation | acetyl-CoA acetyltransferase | 100.671202 | 0.1746891 | 0.2913066 | 0.5996743 | 0.5487233 | 0.6405866 | |
| hotlake_ucc_80896 | CD01DRAFT_0003 | Regulatory functions::Signal transduction::Unknown function | Enzymes of unknown specificity::Two-component systems::Taxis::Small molecule interactions | two component signal transduction system histidine kinase | 3.303432 | -1.3303797 | 1.2096676 | -1.0997895 | 0.2714238 | 0.3704850 | |
| hotlake_ucc_80897 | CD01DRAFT_0004 | COG3287 family protein of unknown function | 0.000000 | NA | NA | NA | NA | NA | |||
| hotlake_ucc_80898 | CD01DRAFT_0005 | outer membrane porin | 247.862849 | -4.0592778 | 0.3342160 | -12.1456716 | 0.0000000 | 0.0000000 | |||
| hotlake_ucc_80900 | CD01DRAFT_0006 | Unknown function | Enzymes of unknown specificity | Diacylglycerol O-acyltransferase | 67.393844 | 0.0106930 | 0.2899656 | 0.0368768 | 0.9705832 | 0.9780839 |
generate differential expression output for HL-58 xylose commensalism over glucose axenic HL-58 glucose axenic vs. HL-58/HL-48 xylose co-culture
Because HL-58 does not grow on xylose, but can survive in the presence of HL-48, this is used as a proxy for the impossible HL_58_X. Note that two treatments are applied (different substrate and introduction of HL-48).
# new graph control to two treatments
HL58.diff.proxy <- deseq_diff(dds.58, c("condition","HL_48_58_X","HL_58_G"), an.58)
write.table(HL58.diff.proxy, file="RNA_seq_outputs/HL58.diff.proxy.csv", quote=FALSE, sep=",", row.names=TRUE, col.names=TRUE)
kable(head(HL58.diff.proxy))
| GeneID | Acc_num | Main_Role | Subrole | Product | Gene | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|---|---|---|---|---|
| hotlake_ucc_124834 | CD01DRAFT_3221 | Glycan biosynthesis and metabolism | Peptidoglycan metabolism | UDP-N-acetylmuramate dehydrogenase | 2542.987859 | -0.7261697 | 0.2241244 | -3.240030 | 0.0011952 | 0.0034253 | |
| hotlake_ucc_124835 | CD01DRAFT_3256 | Xenobiotics biodegradation and metabolism | Benzoate degradation | acetyl-CoA acetyltransferase | 100.671202 | 0.9339691 | 0.2889583 | 3.232193 | 0.0012284 | 0.0035027 | |
| hotlake_ucc_80896 | CD01DRAFT_0003 | Regulatory functions::Signal transduction::Unknown function | Enzymes of unknown specificity::Two-component systems::Taxis::Small molecule interactions | two component signal transduction system histidine kinase | 3.303432 | -1.4991964 | 1.0699322 | -1.401207 | 0.1611522 | 0.2438791 | |
| hotlake_ucc_80897 | CD01DRAFT_0004 | COG3287 family protein of unknown function | 0.000000 | NA | NA | NA | NA | NA | |||
| hotlake_ucc_80898 | CD01DRAFT_0005 | outer membrane porin | 247.862849 | 3.7688935 | 0.3242340 | 11.623992 | 0.0000000 | 0.0000000 | |||
| hotlake_ucc_80900 | CD01DRAFT_0006 | Unknown function | Enzymes of unknown specificity | Diacylglycerol O-acyltransferase | 67.393844 | 0.3743107 | 0.2844672 | 1.315831 | 0.1882308 | 0.2780080 |
setup volcano plot for HL48 glucose competition this analysis barrows ideas/code from a previously described example source https://twbattaglia.github.io/2016/12/17/volcano-plot/
add_color_cutoff <- function(df, pvalue_cutoff = -log10(0.05), up_name, down_name, default_name = "None"){
# This function is hard-coded to our data. It's not meant to be a general function.
df$color <- default_name
df$color[df$lfc > 0 & df$pvalue > pvalue_cutoff] <- up_name
df$color[df$lfc < 0 & df$pvalue > pvalue_cutoff] <- down_name
return(df)
}
# https://stackoverflow.com/questions/7367138/text-wrap-for-plot-titles
wrap_strings <- function(x, goal_width = 40){
as.character(sapply(x,FUN=function(x){
if(is.na(x)){return("")}
if(nchar(x) == 0){return("")}
print((nchar(x) / (ceiling(nchar(x)/goal_width))))
paste(strwrap(x, width = (nchar(x) / (ceiling(nchar(x)/goal_width)))), collapse=" \n")
}))
}
# Test:
c("test", "", NA) %>% wrap_strings()
## [1] 4
## [1] "test" "" ""
# Also use it on full gp.labels
gp.labels$Product <- wrap_strings(gp.labels$Product)
## [1] 29
## [1] 33
## [1] 31.5
## [1] 31.5
## [1] 18
## [1] 33
## [1] 21
## [1] 24.5
## [1] 24.5
## [1] 26
## [1] 31.5
## [1] 40
## [1] 33
## [1] 25
## [1] 25.5
## [1] 26.5
## [1] 40
## [1] 21
## [1] 23
## [1] 22.5
## [1] 20.5
## [1] 38
## [1] 23
## [1] 37
## [1] 35
## [1] 35
plot_v <- function(df, df.labels){
# This function is hard-coded to our data. It's not meant to be a general function.
plot <- ggplot(df, aes(x = lfc, y = pvalue))
return(plot +
#geom_vline(xintercept = 0, color = "black") + # add line at 0
geom_vline(xintercept = c(-log2(2),log2(2)), color = "grey40") + # Add cutoffs
geom_hline(yintercept = -log10(0.05), color = "grey40") + # we put our pvalue cutoff in here
geom_point(aes(color = factor(color)), size = 2, alpha = 0.5, na.rm = TRUE) +
theme(legend.position = "none") + # remove legend
# We let's add these manually so they match the different graph
#annotate("text", x = -2, y = 0, label = "Axenic", size = 5, color = "black") + # add Untreated text
#annotate("text", x = 2, y = 0, label = "Co-culture", size = 5, color = "red") + # add Treated text
#xlab(expression(log[2]("Co-culture" / "Axenic"))) + # x-axis label
ylab(expression(-log[10]("adjusted p-value"))) + # y-axis label
scale_y_continuous(trans = "log1p") + #transform the y-axis
scale_color_manual(values = c("Co-culture" = viridis(10, option = "C")[8],
"Axenic" = viridis(10, option = "C")[1],
"None" = "grey")) + # We could add new colors here for Commensalism and Competition plots.
geom_text_repel(data = df.labels, lineheight = 0.8,
mapping = aes(label = Product), min.segment.length = unit(0, "lines")
,box.padding = unit(0.2, "lines"), point.padding = unit(0.2, "lines")
)
)
}
Fig2-A, Colored by directionality
Colored by directionality
Colored by directionality
## [1] 23
## [1] 32.5
## [1] 31
## [1] 19
## [1] 38.5
## [1] 20
## [1] 34
## [1] 35
## [1] 22
## [1] 34.5
## Warning: Removed 2 rows containing missing values (geom_text_repel).
## [1] 24
## [1] 34
## [1] 19
## [1] 23
## [1] 27
## [1] 16
## [1] 26.5
## [1] 32.5
## [1] 23
## [1] 37
## [1] 38
## [1] 38.5
## [1] 26.5
## [1] 32.5
## [1] 23
## [1] 24
## [1] 22
## [1] 19
## [1] 23
## [1] 27
Dot plots!
This section applies filters to differentially expressed genes and calculates those gene function categories that are statistically enriched from the genome of each species
#trim differentially expressed genes by a pvalue cutoff p-adjusted <= 0.05
HL48.diff.comp.filt <- HL48.diff.comp %>% subset(padj <= 0.05)
HL48.diff.cmns.filt <- HL48.diff.cmns %>% subset(padj <= 0.05)
HL58.diff.comp.filt <- HL58.diff.comp %>% subset(padj <= 0.05)
# fold changes that are significant
HL58.diff.comp.filt$log2FoldChange %>% qplot() + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#trim differentially expressed genes by fold change; FC greater than 2 (< log2 = -1 or > log2 = 1)
HL48.diff.comp.filtU <- HL48.diff.comp.filt %>% subset(log2FoldChange >= 1)
HL48.diff.cmns.filtU <- HL48.diff.cmns.filt %>% subset(log2FoldChange >= 1)
HL58.diff.comp.filtU <- HL58.diff.comp.filt %>% subset(log2FoldChange >= 1)
# HL48.diff.comp.filt <- HL48.diff.comp.filt %>% subset(abs(log2FoldChange) >= 1)
# HL48.diff.cmns.filt <- HL48.diff.cmns.filt %>% subset(abs(log2FoldChange) >= 1)
# HL58.diff.comp.filt <- HL58.diff.comp.filt %>% subset(abs(log2FoldChange) >= 1)
HL48.diff.comp.filtD <- HL48.diff.comp.filt %>% subset(log2FoldChange <= -1)
HL48.diff.cmns.filtD <- HL48.diff.cmns.filt %>% subset(log2FoldChange <= -1)
HL58.diff.comp.filtD <- HL58.diff.comp.filt %>% subset(log2FoldChange <= -1)
# HL58.diff.comp.filtU$log2FoldChange %>% qplot() + geom_histogram() # all up
# HL58.diff.comp.filtD$log2FoldChange %>% qplot() + geom_histogram() # all down
# fold changes that are significant and OVER 2 (but not under). So this capture enrichment only.
HL58.diff.comp.filt$log2FoldChange %>% qplot() + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#prepare input files for FE
HL48.comp.FE.U <- HL48.diff.comp.filtU %>% select(GeneID) %>% data.frame(., ModuleID = "HL_48_58_G_v_HL_48_G")
HL48.cmns.FE.U <- HL48.diff.cmns.filtU %>% select(GeneID) %>% data.frame(., ModuleID = "HL_48_58_X_v_HL_48_X")
HL58.comp.FE.U <- HL58.diff.comp.filtU %>% select(GeneID) %>% data.frame(., ModuleID = "HL_48_58_G_v_HL_58_G")
HL48.comp.FE.D <- HL48.diff.comp.filtD %>% select(GeneID) %>% data.frame(., ModuleID = "HL_48_58_G_v_HL_48_G")
HL48.cmns.FE.D <- HL48.diff.cmns.filtD %>% select(GeneID) %>% data.frame(., ModuleID = "HL_48_58_X_v_HL_48_X")
HL58.comp.FE.D <- HL58.diff.comp.filtD %>% select(GeneID) %>% data.frame(., ModuleID = "HL_48_58_G_v_HL_58_G")
FE calculations of main role categories
#first, set up the subrole FE function
#input x is a FE input e.g. HL48.comp.FE
#input y is an annotation file e.g. an.48
mainroleeModuleEnrichment <- function(x,y)
{
y[grepl('::', y$Main_Role), 'Main_Role'] <- 'Ambiguous_Function'
fModuleData <- x
fAnnotData <- y[, c("GeneID", "Main_Role")]
colnames(fAnnotData) <- c("GeneID", "Main_Role")
uniqueFunCats <- unique(fAnnotData[c("Main_Role")])
Main_Role <- unique(fAnnotData$Main_Role)
modules <- unique(fModuleData$ModuleID)
numGenesInGenome <- nrow(fAnnotData)
outputData <- NULL
for (mID in modules)
{
genesInSet <- fModuleData[fModuleData$ModuleID == mID, "GeneID"]
numGenesInSet <- length(genesInSet)
for (i in 1:nrow(uniqueFunCats))
{
Main_Role <- uniqueFunCats[i, "Main_Role"]
genesInGenomeWithAnnot <- fAnnotData[fAnnotData$Main_Role == Main_Role, "GeneID"]
numGenesInGenomeWithAnnot <- length(genesInGenomeWithAnnot)
numGenesInSetWithAnnot <- length(intersect(genesInSet, genesInGenomeWithAnnot))
#=====================================================================================
# Run Fisher's exact test
counts <- matrix(c(numGenesInSetWithAnnot, numGenesInSet-numGenesInSetWithAnnot,
numGenesInGenomeWithAnnot, numGenesInGenome-numGenesInGenomeWithAnnot), nrow=2)
res <- fisher.test(counts)
if (res$p.value <= 0.05)
{
pModule <- numGenesInSetWithAnnot/numGenesInSet
pGenome <- numGenesInGenomeWithAnnot/numGenesInGenome
ratio <- pModule/pGenome
if (pModule > pGenome)
{
outputData <- rbind(outputData, cbind(ModuleID=mID, Main_Role=Main_Role, PVal=res$p.value, Ratio=ratio, PercentageInModule=pModule, PercentageInGenome=pGenome))
}
}
}
}
outputData <- data.frame(outputData)
outputData
}
#mainrolefunctial enrichment function using HL48.comp.FE and an.48 as x and y inputs
HL48.comp.MR.FE.U <- mainroleeModuleEnrichment(HL48.comp.FE.U, an.48)
HL48.comp.MR.FE.U$Treatment <- "HL-48 Glucose\nCompetition"
HL48.comp.MR.FE.U$dir <- "Increase"
#mainrolefunctial enrichment function using HL48.cmns.FE and an.48 as x and y inputs
HL48.cmns.MR.FE.U <- mainroleeModuleEnrichment(HL48.cmns.FE.U, an.48)
HL48.cmns.MR.FE.U$Treatment <- "HL-48 Xylose\nCommensalism"
HL48.cmns.MR.FE.U$dir <- "Increase"
#mainrolefunctial enrichment function using HL58.comp.FE and an.58 as x and y inputs
HL58.comp.MR.FE.U <- mainroleeModuleEnrichment(HL58.comp.FE.U, an.58)
HL58.comp.MR.FE.U$Treatment <- 'HL-58 Glucose\nCompetition'
HL58.comp.MR.FE.U$dir <- "Increase"
# matching set for reduced (D Down) enrichment
HL48.comp.MR.FE.D <- mainroleeModuleEnrichment(HL48.comp.FE.D, an.48)
HL48.comp.MR.FE.D$Treatment <- 'HL-48 Glucose\nCompetition'
HL48.comp.MR.FE.D$dir <- "Decrease"
# Empty?
HL48.cmns.MR.FE.D <- mainroleeModuleEnrichment(HL48.cmns.FE.D, an.48)
HL48.cmns.MR.FE.D$Treatment <- 'HL-48 Xylose\nCommensalism'
HL48.cmns.MR.FE.D$dir <- "Decrease"
HL58.comp.MR.FE.D <- mainroleeModuleEnrichment(HL58.comp.FE.D, an.58)
HL58.comp.MR.FE.D$Treatment <- 'HL-58 Glucose\nCompetition'
HL58.comp.MR.FE.D$dir <- "Decrease"
#pull it together
MR.FE <- rbind(HL48.comp.MR.FE.U, HL48.cmns.MR.FE.U, HL58.comp.MR.FE.U,
HL48.comp.MR.FE.D, # This is the empty data frame
HL48.cmns.MR.FE.D, HL58.comp.MR.FE.D)
kable(MR.FE, caption = "Functional enrichment of main role gene categories")
| ModuleID | Main_Role | PVal | Ratio | PercentageInModule | PercentageInGenome | Treatment | dir |
|---|---|---|---|---|---|---|---|
| HL_48_58_G_v_HL_48_G | Transport and binding proteins | 0.0247849323290425 | 4.3644578313253 | 0.428571428571429 | 0.0981958000591541 | HL-48 Glucose | |
| Competition Increase | |||||||
| HL_48_58_X_v_HL_48_X | Cell structure_ growth_ and death | 0.000621398637458999 | 5.10504745470233 | 0.0921052631578947 | 0.018041999408459 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_X_v_HL_48_X | Energy metabolism | 0.00765840961416274 | 3.60704125177809 | 0.0789473684210526 | 0.0218870156758355 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_X_v_HL_48_X | Translation | 0.000125291419464542 | 3.68166969147005 | 0.157894736842105 | 0.0428867199053534 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_X_v_HL_48_X | Transcription | 0.00116854340692983 | 9.8859649122807 | 0.0526315789473684 | 0.00532386867790594 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_G_v_HL_58_G | Intracellular trafficking_ assembly_ and processing | 0.0264067225592166 | 1.82752593907784 | 0.0631970260223048 | 0.0345806451612903 | HL-58 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_58_G | Transport and binding proteins | 0.0137794066704247 | 1.618562298985 | 0.111524163568773 | 0.0689032258064516 | HL-58 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_58_G | Carbohydrate metabolism | 0.0391890775576213 | 1.92069392812887 | 0.0446096654275093 | 0.0232258064516129 | HL-58 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_58_G | Nucleic acid metabolism | 0.00706118571573953 | 2.26598721857901 | 0.0520446096654275 | 0.0229677419354839 | HL-58 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_48_G | 0.0270972329550317 | 1.71859237536657 | 0.52 | 0.302573203194321 | HL-48 Glucose | ||
| Competition Decrease | |||||||
| HL_48_58_X_v_HL_48_X | Metabolism of other amino acids and amines | 0.0244267716725998 | 9.11320754716981 | 0.0377358490566038 | 0.0041407867494824 | HL-48 Xylose | |
| Commensalism Decrease | |||||||
| HL_48_58_G_v_HL_58_G | 6.67985051333306e-08 | 1.48494842015754 | 0.515037593984962 | 0.346838709677419 | HL-58 Glucose | ||
| Competition Decrease | |||||||
| HL_48_58_G_v_HL_58_G | Cell motility and adherance | 0.0303716770237368 | 2.33082706766917 | 0.0300751879699248 | 0.0129032258064516 | HL-58 Glucose | |
| Competition Decrease |
MR <- data.frame(dir = MR.FE$dir, Treatment = MR.FE$Treatment, Main_Role = MR.FE$Main_Role,
Ratio = as.numeric(MR.FE$Ratio), PercentageInModule = 100*(as.numeric(MR.FE$PercentageInModule)))
MR$Ratio %>% summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 4 7 7 10 13
g.MR.FE <- ggplot(MR, aes(x = Treatment, y = Main_Role, size = Ratio, fill = PercentageInModule))
g.MR.FE <- g.MR.FE +
#geom_point(shape = 21, colour = "#000000", fill = "#40b8d0") +
geom_point(shape = 21) +
facet_grid(~dir) +
ggtitle("Functional enrichment of gene main roles") +
labs(x = "Treatment", y = "")
g.MR.FE
FE calculations on main role categories
#first, set up the subrole FE function
#input x is a FE input e.g. HL48.comp.FE
#input y is an annotation file e.g. an.48
subroleModuleEnrichment <- function(x,y)
{
y[grepl('::', y$Subrole), 'Subrole'] <- 'Ambiguous_Function'
fModuleData <- x
fAnnotData <- y[, c("GeneID", "Subrole")]
colnames(fAnnotData) <- c("GeneID", "Subrole")
uniqueFunCats <- unique(fAnnotData[c("Subrole")])
subRole <- unique(fAnnotData$Subrole)
modules <- unique(fModuleData$ModuleID)
numGenesInGenome <- nrow(fAnnotData)
outputData <- NULL
for (mID in modules)
{
genesInSet <- fModuleData[fModuleData$ModuleID == mID, "GeneID"]
numGenesInSet <- length(genesInSet)
for (i in 1:nrow(uniqueFunCats))
{
subRole <- uniqueFunCats[i, "Subrole"]
genesInGenomeWithAnnot <- fAnnotData[fAnnotData$Subrole == subRole, "GeneID"]
numGenesInGenomeWithAnnot <- length(genesInGenomeWithAnnot)
numGenesInSetWithAnnot <- length(intersect(genesInSet, genesInGenomeWithAnnot))
#=====================================================================================
# Run Fisher's exact test
counts <- matrix(c(numGenesInSetWithAnnot, numGenesInSet-numGenesInSetWithAnnot,
numGenesInGenomeWithAnnot, numGenesInGenome-numGenesInGenomeWithAnnot), nrow=2)
res <- fisher.test(counts)
if (res$p.value <= 0.05)
{
pModule <- numGenesInSetWithAnnot/numGenesInSet
pGenome <- numGenesInGenomeWithAnnot/numGenesInGenome
ratio <- pModule/pGenome
if (pModule > pGenome)
{
outputData <- rbind(outputData, cbind(ModuleID=mID, Subrole=subRole, PVal=res$p.value, Ratio=ratio, PercentageInModule=pModule, PercentageInGenome=pGenome))
}
}
}
}
outputData <- data.frame(outputData)
outputData
}
#subrolefunctial enrichment function using HL48.comp.FE and an.48 as x and y inputs
HL48.comp.SR.FE.U <- subroleModuleEnrichment(HL48.comp.FE.U, an.48)
HL48.comp.SR.FE.U$Treatment <- 'HL-48 Glucose\nCompetition'
HL48.comp.SR.FE.U$dir <- "Increase"
#subrolefunctial enrichment function using HL48.cmns.FE and an.48 as x and y inputs
HL48.cmns.SR.FE.U <- subroleModuleEnrichment(HL48.cmns.FE.U, an.48)
HL48.cmns.SR.FE.U$Treatment <- 'HL-48 Xylose\nCommensalism'
HL48.cmns.SR.FE.U$dir <- "Increase"
#subrolefunctial enrichment function using HL58.comp.FE and an.58 as x and y inputs
HL58.comp.SR.FE.U <- subroleModuleEnrichment(HL58.comp.FE.U, an.58)
HL58.comp.SR.FE.U$Treatment <- 'HL-58 Glucose\nCompetition'
HL58.comp.SR.FE.U$dir <- "Increase"
# matching set for reduced (D Down) enrichment
HL48.comp.SR.FE.D <- subroleModuleEnrichment(HL48.comp.FE.D, an.48)
HL48.comp.SR.FE.D$Treatment <- 'HL-48 Glucose\nCompetition'
HL48.comp.SR.FE.D$dir <- "Decrease"
HL48.cmns.SR.FE.D <- subroleModuleEnrichment(HL48.cmns.FE.D, an.48)
HL48.cmns.SR.FE.D$Treatment <- 'HL-48 Xylose\nCommensalism'
HL48.cmns.SR.FE.D$dir <- "Decrease"
HL58.comp.SR.FE.D <- subroleModuleEnrichment(HL58.comp.FE.D, an.58)
HL58.comp.SR.FE.D$Treatment <- 'HL-58 Glucose\nCompetition'
HL58.comp.SR.FE.D$dir <- "Decrease"
#pull it together
SR.FE <- rbind(HL48.comp.SR.FE.U, HL48.cmns.SR.FE.U, HL58.comp.SR.FE.U,
HL48.comp.SR.FE.D, HL48.cmns.SR.FE.D, HL58.comp.SR.FE.D)
kable(SR.FE, caption = "Functional enrichment of subrole gene categories")
| ModuleID | Subrole | PVal | Ratio | PercentageInModule | PercentageInGenome | Treatment | dir |
|---|---|---|---|---|---|---|---|
| HL_48_58_G_v_HL_48_G | Carbohydrates_ organic alcohols_ and acids | 0.00248325134199355 | 26.8333333333333 | 0.285714285714286 | 0.0106477373558119 | HL-48 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_48_G | Phenylalanine_ tyrosine and tryptophan biosynthesis | 0.0366344810198446 | 28.4117647058824 | 0.142857142857143 | 0.00502809819580006 | HL-48 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_48_G | Pyruvate metabolism | 0.0184637316007577 | 60.375 | 0.142857142857143 | 0.00236616385684709 | HL-48 Glucose | |
| Competition Increase | |||||||
| HL_48_58_X_v_HL_48_X | Protein export_ secretion_ and sorting | 0.016590769967938 | 6.06638755980861 | 0.0394736842105263 | 0.00650695060632949 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_X_v_HL_48_X | Cell division | 0.000522576883149083 | 12.7105263157895 | 0.0526315789473684 | 0.0041407867494824 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_X_v_HL_48_X | Transcription factors | 0.0469103929027543 | 6.35526315789474 | 0.0263157894736842 | 0.0041407867494824 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_X_v_HL_48_X | Oxidative phosphorylation | 0.00128086052095729 | 6.74043062200957 | 0.0657894736842105 | 0.00976042590949423 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_X_v_HL_48_X | Chromosome and assoicated proteins | 0.0272987844560061 | 8.89736842105263 | 0.0263157894736842 | 0.00295770482105886 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_X_v_HL_48_X | Ribosomal proteins: synthesis and modification | 8.62947452254726e-05 | 6.0321141837645 | 0.105263157894737 | 0.0174504584442473 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_X_v_HL_48_X | RNA polymerase | 0.00675724604672468 | 22.2434210526316 | 0.0263157894736842 | 0.00118308192842354 | HL-48 Xylose | |
| Commensalism Increase | |||||||
| HL_48_58_G_v_HL_58_G | Peptidoglycan metabolism | 0.00174061263396245 | 4.1157727031333 | 0.0297397769516729 | 0.0072258064516129 | HL-58 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_58_G | Protein export_ secretion_ and sorting | 0.0369465840600951 | 2.40086741016109 | 0.0260223048327138 | 0.0108387096774194 | HL-58 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_58_G | Pentose phosphate pathway | 0.0077778747775878 | 6.40231309376291 | 0.0148698884758364 | 0.00232258064516129 | HL-58 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_58_G | Amino acids_ peptides and amines | 0.0257608076700826 | 4.1157727031333 | 0.0148698884758364 | 0.00361290322580645 | HL-58 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_58_G | Citrate cycle (TCA cycle) | 0.0141938924524084 | 4.00144568360182 | 0.0185873605947955 | 0.00464516129032258 | HL-58 Glucose | |
| Competition Increase | |||||||
| HL_48_58_G_v_HL_48_G | 0.0270972329550317 | 1.71859237536657 | 0.52 | 0.302573203194321 | HL-48 Glucose | ||
| Competition Decrease | |||||||
| HL_48_58_G_v_HL_48_G | General | 0.0372163936778414 | 3.14511627906977 | 0.16 | 0.0508725229222124 | HL-48 Glucose | |
| Competition Decrease | |||||||
| HL_48_58_X_v_HL_48_X | Small molecule interactions | 0.0277507186204185 | 4.78443396226415 | 0.0566037735849057 | 0.0118308192842354 | HL-48 Xylose | |
| Commensalism Decrease | |||||||
| HL_48_58_X_v_HL_48_X | Alanine_ aspartate and glutamate metabolism | 0.016352582304608 | 11.598627787307 | 0.0377358490566038 | 0.00325347530316474 | HL-48 Xylose | |
| Commensalism Decrease | |||||||
| HL_48_58_X_v_HL_48_X | aliphatic and aromatic amines | 0.0456038265878301 | 31.8962264150943 | 0.0188679245283019 | 0.000591540964211772 | HL-48 Xylose | |
| Commensalism Decrease | |||||||
| HL_48_58_G_v_HL_58_G | 6.67985051333306e-08 | 1.48494842015754 | 0.515037593984962 | 0.346838709677419 | HL-58 Glucose | ||
| Competition Decrease | |||||||
| HL_48_58_G_v_HL_58_G | Ubiquinone and other terpenoid-quinone biosynthesis | 0.0373922695734922 | 4.85588972431078 | 0.0112781954887218 | 0.00232258064516129 | HL-58 Glucose | |
| Competition Decrease | |||||||
| HL_48_58_G_v_HL_58_G | Secretion systems | 0.0203596330863447 | 4.48235974551764 | 0.0150375939849624 | 0.00335483870967742 | HL-58 Glucose | |
| Competition Decrease | |||||||
| HL_48_58_G_v_HL_58_G | Flagella complex and associated proteins | 0.0103865233371454 | 2.91353383458647 | 0.0300751879699248 | 0.0103225806451613 | HL-58 Glucose | |
| Competition Decrease | |||||||
| HL_48_58_G_v_HL_58_G | Iron sulfur clusters | 0.000883817191067721 | 9.10479323308271 | 0.018796992481203 | 0.00206451612903226 | HL-58 Glucose | |
| Competition Decrease |
SR <- data.frame(dir = SR.FE$dir, Treatment = SR.FE$Treatment, Subrole = SR.FE$Subrole,
Ratio = as.numeric(SR.FE$Ratio), PercentageInModule = 100*(as.numeric(SR.FE$PercentageInModule)))
SR$Ratio %>% summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 13.00 12.52 18.00 24.00
g.SR.FE <- ggplot(SR, aes(x = Treatment, y = Subrole, size = Ratio, fill = PercentageInModule))
g.SR.FE <- g.SR.FE +
facet_grid(~dir) +
#geom_point(shape = 21, colour = "#000000", fill = "#40b8d0") +
geom_point(shape = 21) +
ggtitle("Functional enrichment of gene subroles") +
labs(x = "", y = "")
g.SR.FE
names(MR)[3] <- "Main Role"
names(SR)[3] <- "Subrole"
head(MR)
## dir Treatment
## 1 Increase HL-48 Glucose\nCompetition
## 2 Increase HL-48 Xylose\nCommensalism
## 3 Increase HL-48 Xylose\nCommensalism
## 4 Increase HL-48 Xylose\nCommensalism
## 5 Increase HL-48 Xylose\nCommensalism
## 6 Increase HL-58 Glucose\nCompetition
## Main Role Ratio
## 1 Transport and binding proteins 1
## 2 Cell structure_ growth_ and death 4
## 3 Energy metabolism 2
## 4 Translation 3
## 5 Transcription 5
## 6 Intracellular trafficking_ assembly_ and processing 7
## PercentageInModule
## 1 100
## 2 400
## 3 300
## 4 500
## 5 200
## 6 800
head(SR)
## dir Treatment
## 1 Increase HL-48 Glucose\nCompetition
## 2 Increase HL-48 Glucose\nCompetition
## 3 Increase HL-48 Glucose\nCompetition
## 4 Increase HL-48 Xylose\nCommensalism
## 5 Increase HL-48 Xylose\nCommensalism
## 6 Increase HL-48 Xylose\nCommensalism
## Subrole Ratio
## 1 Carbohydrates_ organic alcohols_ and acids 1
## 2 Phenylalanine_ tyrosine and tryptophan biosynthesis 2
## 3 Pyruvate metabolism 3
## 4 Protein export_ secretion_ and sorting 7
## 5 Cell division 4
## 6 Transcription factors 8
## PercentageInModule
## 1 200
## 2 100
## 3 100
## 4 400
## 5 500
## 6 300
# melt then combine into Enriched Roles
mr.melt <- melt(MR, measure.vars = "Main Role")
sr.melt <- melt(SR, measure.vars = "Subrole")
head(mr.melt)
## dir Treatment Ratio PercentageInModule variable
## 1 Increase HL-48 Glucose\nCompetition 1 100 Main Role
## 2 Increase HL-48 Xylose\nCommensalism 4 400 Main Role
## 3 Increase HL-48 Xylose\nCommensalism 2 300 Main Role
## 4 Increase HL-48 Xylose\nCommensalism 3 500 Main Role
## 5 Increase HL-48 Xylose\nCommensalism 5 200 Main Role
## 6 Increase HL-58 Glucose\nCompetition 7 800 Main Role
## value
## 1 Transport and binding proteins
## 2 Cell structure_ growth_ and death
## 3 Energy metabolism
## 4 Translation
## 5 Transcription
## 6 Intracellular trafficking_ assembly_ and processing
head(sr.melt)
## dir Treatment Ratio PercentageInModule variable
## 1 Increase HL-48 Glucose\nCompetition 1 200 Subrole
## 2 Increase HL-48 Glucose\nCompetition 2 100 Subrole
## 3 Increase HL-48 Glucose\nCompetition 3 100 Subrole
## 4 Increase HL-48 Xylose\nCommensalism 7 400 Subrole
## 5 Increase HL-48 Xylose\nCommensalism 4 500 Subrole
## 6 Increase HL-48 Xylose\nCommensalism 8 300 Subrole
## value
## 1 Carbohydrates_ organic alcohols_ and acids
## 2 Phenylalanine_ tyrosine and tryptophan biosynthesis
## 3 Pyruvate metabolism
## 4 Protein export_ secretion_ and sorting
## 5 Cell division
## 6 Transcription factors
el.melt <- rbind(mr.melt, sr.melt)
# rename treatments
el.melt$Treatment <- el.melt$Treatment %>% factor(labels =
c("HL-48 \nGlucose\nCompetition", "HL-48 \nXylose\nCommensalism", "HL-58 \nGlucose\nCompetition"))
# rename treatments (without mentioning sugar type)
#el.melt$Treatment <- el.melt$Treatment %>% factor(labels = c("HL-48 \nCompetition", "HL-48 \nCommensalism", "HL-58 \nCompetition"))
# reorder levels
el.melt$Treatment <- factor(el.melt$Treatment, levels(el.melt$Treatment)[c(1,3,2)])
# Remove underscore
el.melt$value <- gsub("_", "", fixed = T, x = el.melt$value)
# Manually edit y-axis elements in this graph. Easy way to change text text
# pico(el.melt$value) # Get text
# el.melt$value <- c("the string"")
# Also remove the unidentified item
el.melt <- el.melt %>% subset(value != "")
g.el.FE <- ggplot(el.melt, aes(x = Treatment, y = value, size = Ratio, fill = PercentageInModule))
g.el.FE + geom_point(shape = 21) + labs(x = "", y = "") + facet_grid(variable~dir, scales = "free_y", space = "free") +
scale_size_continuous(range = c(2, 10)) + scale_fill_viridis(option = "B", begin = .1, end = .9) +
theme(
#legend.position = c(-0.9, -0.12), legend.direction = "horizontal", legend.box = "vertical", plot.margin = unit(c(10, 5.5, 30, 5.5), "points"),
legend.position = c(-0.5, -0.10), legend.direction = "horizontal", legend.box = "vertical", plot.margin = unit(c(10, 5.5, 25, 5.5), "points"),
strip.background = element_blank(), strip.text = element_text(size = 12), legend.spacing.y = unit(0, "points"),
axis.text.x = element_text(angle = -30, hjust=0, vjust = 1, size = 10)) # angle of 30
ggsave("figures/fig2.pdf", width = 120, height = 120, units = "mm", scale = 1.5)
# Input data sets
HL48.diff.coculture.f <- HL48.diff.coculture %>% subset(padj <= 0.05)
HL58.diff.coculture.f <- HL58.diff.coculture %>% subset(padj <= 0.05)
HL58.diff.proxy.f <- HL58.diff.proxy %>% subset(padj <= 0.05)
HL48.diff.coculture.U <- HL48.diff.coculture.f %>% subset(log2FoldChange >= 1)
HL58.diff.coculture.U <- HL58.diff.coculture.f %>% subset(log2FoldChange >= 1)
HL58.diff.proxy.U <- HL58.diff.proxy.f %>% subset(log2FoldChange >= 1)
HL48.diff.coculture.D <- HL48.diff.coculture.f %>% subset(log2FoldChange <= -1)
HL58.diff.coculture.D <- HL58.diff.coculture.f %>% subset(log2FoldChange <= -1)
HL58.diff.proxy.D <- HL58.diff.proxy.f %>% subset(log2FoldChange <= -1)
#prepare input files for FE
HL48.diff.coculture.U.FE <- HL48.diff.coculture.U %>% select(GeneID) %>% data.frame(., ModuleID = "test")
HL58.diff.coculture.U.FE <- HL58.diff.coculture.U %>% select(GeneID) %>% data.frame(., ModuleID = "test2")
HL58.diff.proxy.U.FE <- HL58.diff.proxy.U %>% select(GeneID) %>% data.frame(., ModuleID = "test3")
HL48.diff.coculture.D.FE <- HL48.diff.coculture.D %>% select(GeneID) %>% data.frame(., ModuleID = "test4")
HL58.diff.coculture.D.FE <- HL58.diff.coculture.D %>% select(GeneID) %>% data.frame(., ModuleID = "test5")
HL58.diff.proxy.D.FE <- HL58.diff.proxy.D %>% select(GeneID) %>% data.frame(., ModuleID = "test6")
# main role
# Up
HL48.diff.coculture.U.FE.MR <- mainroleeModuleEnrichment(HL48.diff.coculture.U.FE, an.48)
HL48.diff.coculture.U.FE.MR$Treatment <- 'HL-48 Coculture'
HL48.diff.coculture.U.FE.MR$dir <- "Increase"
HL58.diff.coculture.U.FE.MR <- mainroleeModuleEnrichment(HL58.diff.coculture.U.FE, an.58)
HL58.diff.coculture.U.FE.MR$Treatment <- 'HL-58 Coculture'
HL58.diff.coculture.U.FE.MR$dir <- "Increase"
HL58.diff.proxy.U.FE.MR <- mainroleeModuleEnrichment(HL58.diff.proxy.U.FE, an.58)
HL58.diff.proxy.U.FE.MR$Treatment <- 'HL-58 Proxy'
HL58.diff.proxy.U.FE.MR$dir <- "Increase"
# down
HL48.diff.coculture.D.FE.MR <- mainroleeModuleEnrichment(HL48.diff.coculture.D.FE, an.48)
#HL48.diff.coculture.D.FE.MR$Treatment <- 'HL48 Coculture' # empty
#HL48.diff.coculture.D.FE.MR$dir <- "Decrease" # empty
HL58.diff.coculture.D.FE.MR <- mainroleeModuleEnrichment(HL58.diff.coculture.D.FE, an.58)
HL58.diff.coculture.D.FE.MR$Treatment <- 'HL-58 Coculture'
HL58.diff.coculture.D.FE.MR$dir <- "Decrease"
HL58.diff.proxy.D.FE.MR <- mainroleeModuleEnrichment(HL58.diff.proxy.D.FE, an.58)
HL58.diff.proxy.D.FE.MR$Treatment <- 'HL-58 Proxy'
HL58.diff.proxy.D.FE.MR$dir <- "Decrease"
#pull it together
new.MR.FE <- rbind(HL48.diff.coculture.U.FE.MR, HL58.diff.coculture.U.FE.MR, HL58.diff.proxy.U.FE.MR,
HL48.diff.coculture.D.FE.MR, HL58.diff.coculture.D.FE.MR, HL58.diff.proxy.D.FE.MR)
#kable(new.MR.FE) #caption = "Functional enrichment of main role gene categories")
# Sub role
# Up
HL48.diff.coculture.U.FE.SR <- subroleModuleEnrichment(HL48.diff.coculture.U.FE, an.48)
HL48.diff.coculture.U.FE.SR$Treatment <- 'HL-48 Coculture'
HL48.diff.coculture.U.FE.SR$dir <- "Increase"
HL58.diff.coculture.U.FE.SR <- subroleModuleEnrichment(HL58.diff.coculture.U.FE, an.58)
HL58.diff.coculture.U.FE.SR$Treatment <- 'HL-58 Coculture'
HL58.diff.coculture.U.FE.SR$dir <- "Increase"
HL58.diff.proxy.U.FE.SR <- subroleModuleEnrichment(HL58.diff.proxy.U.FE, an.58)
HL58.diff.proxy.U.FE.SR$Treatment <- 'HL-58 Proxy'
HL58.diff.proxy.U.FE.SR$dir <- "Increase"
# down
HL48.diff.coculture.D.FE.SR <- subroleModuleEnrichment(HL48.diff.coculture.D.FE, an.48)
HL48.diff.coculture.D.FE.SR$Treatment <- 'HL-48 Coculture'
HL48.diff.coculture.D.FE.SR$dir <- "Decrease"
HL58.diff.coculture.D.FE.SR <- subroleModuleEnrichment(HL58.diff.coculture.D.FE, an.58)
HL58.diff.coculture.D.FE.SR$Treatment <- 'HL-58 Coculture'
HL58.diff.coculture.D.FE.SR$dir <- "Decrease"
HL58.diff.proxy.D.FE.SR <- subroleModuleEnrichment(HL58.diff.proxy.D.FE, an.58)
HL58.diff.proxy.D.FE.SR$Treatment <- 'HL-58 Proxy'
HL58.diff.proxy.D.FE.SR$dir <- "Decrease"
#pull it together
new.SR.FE <- rbind(HL48.diff.coculture.U.FE.SR, HL58.diff.coculture.U.FE.SR, HL58.diff.proxy.U.FE.SR,
HL48.diff.coculture.D.FE.SR, HL58.diff.coculture.D.FE.SR, HL58.diff.proxy.D.FE.SR)
#kable(new.SR.FE) #caption = "Functional enrichment of sub-role gene categories")
new.MR <- data.frame(dir = new.MR.FE$dir, Treatment = new.MR.FE$Treatment, Main_role = new.MR.FE$Main_Role,
Ratio = as.numeric(new.MR.FE$Ratio), PercentageInModule = 100*(as.numeric(new.MR.FE$PercentageInModule)))
new.SR <- data.frame(dir = new.SR.FE$dir, Treatment = new.SR.FE$Treatment, Subrole = new.SR.FE$Subrole,
Ratio = as.numeric(new.SR.FE$Ratio), PercentageInModule = 100*(as.numeric(new.SR.FE$PercentageInModule)))
names(new.MR)[3] <- "Main Role"
names(new.SR)[3] <- "Subrole"
head(new.MR)
## dir Treatment Main Role Ratio
## 1 Increase HL-48 Coculture Carbohydrate metabolism 1
## 2 Increase HL-58 Coculture Carbohydrate metabolism 2
## 3 Increase HL-58 Proxy Xenobiotics biodegradation and metabolism 4
## 4 Increase HL-58 Proxy Fatty acid and lipid metabolism 3
## 5 Decrease HL-58 Coculture Xenobiotics biodegradation and metabolism 8
## 6 Decrease HL-58 Coculture 5
## PercentageInModule
## 1 100
## 2 200
## 3 300
## 4 400
## 5 500
## 6 700
head(new.SR)
## dir Treatment Subrole
## 1 Increase HL-48 Coculture Polysacharide and lipopolysaccharide metabolism
## 2 Increase HL-48 Coculture Invasion response
## 3 Increase HL-48 Coculture Glycolysis / Gluconeogenesis
## 4 Increase HL-48 Coculture Pyruvate metabolism
## 5 Increase HL-48 Coculture Butanoate metabolism
## 6 Increase HL-48 Coculture Anions
## Ratio PercentageInModule
## 1 2 400
## 2 5 300
## 3 3 300
## 4 4 100
## 5 1 100
## 6 7 500
# melt then combine into Enriched Roles
new.mr.melt <- melt(new.MR, measure.vars = "Main Role")
new.sr.melt <- melt(new.SR, measure.vars = "Subrole")
head(new.mr.melt)
## dir Treatment Ratio PercentageInModule variable
## 1 Increase HL-48 Coculture 1 100 Main Role
## 2 Increase HL-58 Coculture 2 200 Main Role
## 3 Increase HL-58 Proxy 4 300 Main Role
## 4 Increase HL-58 Proxy 3 400 Main Role
## 5 Decrease HL-58 Coculture 8 500 Main Role
## 6 Decrease HL-58 Coculture 5 700 Main Role
## value
## 1 Carbohydrate metabolism
## 2 Carbohydrate metabolism
## 3 Xenobiotics biodegradation and metabolism
## 4 Fatty acid and lipid metabolism
## 5 Xenobiotics biodegradation and metabolism
## 6
head(new.sr.melt)
## dir Treatment Ratio PercentageInModule variable
## 1 Increase HL-48 Coculture 2 400 Subrole
## 2 Increase HL-48 Coculture 5 300 Subrole
## 3 Increase HL-48 Coculture 3 300 Subrole
## 4 Increase HL-48 Coculture 4 100 Subrole
## 5 Increase HL-48 Coculture 1 100 Subrole
## 6 Increase HL-48 Coculture 7 500 Subrole
## value
## 1 Polysacharide and lipopolysaccharide metabolism
## 2 Invasion response
## 3 Glycolysis / Gluconeogenesis
## 4 Pyruvate metabolism
## 5 Butanoate metabolism
## 6 Anions
new.el.melt <- rbind(new.mr.melt, new.sr.melt)
# rename treatments. (Maybe not needed based on my new names for 'modules')
# new.el.melt$Treatment <- el.melt$Treatment %>% factor(labels = c("HL48 \nGlucose\nCompetition", "HL48 \nXylose\nCommensalism", "HL58 \nGlucose\nCompetition"))
# Remove underscore
new.el.melt$value <- gsub("_", "", fixed = T, x = new.el.melt$value)
# Manually edit y-axis elements in this graph. Easy way to change text text
# pico(el.melt$value) # Get text
# el.melt$value <- c("the string"")
# Also remove the unidentified item
new.el.melt <- new.el.melt %>% subset(value != "")
g.new.el.FE <- ggplot(new.el.melt, aes(x = Treatment, y = value, size = Ratio, fill = PercentageInModule))
g.new.el.FE + geom_point(shape = 21) + labs(x = "", y = "") + facet_grid(variable~dir, scales = "free_y", space = "free") +
scale_size_continuous(range = c(2, 10)) + scale_fill_viridis(option = "B", begin = .1, end = .9) +
theme(
#legend.position = c(-0.9, -0.12), legend.direction = "horizontal", legend.box = "vertical", plot.margin = unit(c(10, 5.5, 30, 5.5), "points"),
legend.position = c(-0.5, -0.10), legend.direction = "horizontal", legend.box = "vertical", plot.margin = unit(c(10, 10, 35, 5.5), "points"),
strip.background = element_blank(), strip.text = element_text(size = 12), legend.spacing.y = unit(0, "points"),
axis.text.x = element_text(angle = -20, hjust=0, vjust = 1, size = 10))
ggsave("figures/sub-FE-new.pdf", width = 114, height = 140, units = "mm", scale = 1.5)
#Save and exit.
knitr::knit_exit(F)
FALSE